From 4054de8477b3ba4addb69cf9783dfb611f60faa6 Mon Sep 17 00:00:00 2001 From: Daniel Sabo Date: Sat, 30 Mar 2013 08:48:52 -0700 Subject: [PATCH] Add SSE2 conversions This patch includes two conversions for RaGaBaA -> RGBA. Depending on the CPU either spin or shuffle is significantly faster. Unless I can find a consistently fast version I'm going to let them fight it out in the babl startup benchmarks. --- configure.ac | 29 +++- extensions/Makefile.am | 18 ++- extensions/sse2-float.c | 299 ++++++++++++++++++++++++++++++++++++++++ extensions/sse2-int16.c | 186 +++++++++++++++++++++++++ 4 files changed, 525 insertions(+), 7 deletions(-) create mode 100644 extensions/sse2-float.c create mode 100644 extensions/sse2-int16.c diff --git a/configure.ac b/configure.ac index ce5a872..296ec27 100644 --- a/configure.ac +++ b/configure.ac @@ -294,9 +294,14 @@ AC_ARG_ENABLE(sse, [ --enable-sse enable SSE support (default=auto)],, enable_sse=$enable_mmx) +AC_ARG_ENABLE(sse2, + [ --enable-sse2 enable SSE2 support (default=auto)],, + enable_sse2=$enable_sse) + if test "x$enable_mmx" = xyes; then BABL_DETECT_CFLAGS(MMX_EXTRA_CFLAGS, '-mmmx') SSE_EXTRA_CFLAGS= + SSE2_EXTRA_CFLAGS= AC_MSG_CHECKING(whether we can compile MMX code) @@ -309,8 +314,11 @@ if test "x$enable_mmx" = xyes; then AC_MSG_RESULT(yes) if test "x$enable_sse" = xyes; then + BABL_DETECT_CFLAGS(fpmath_flag, '-mfpmath=sse') + SSE_EXTRA_CFLAGS="$MMX_EXTRA_CFLAGS $fpmath_flag" + BABL_DETECT_CFLAGS(sse_flag, '-msse') - SSE_EXTRA_CFLAGS="$MMX_EXTRA_CFLAGS $sse_flag" + SSE_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $sse_flag" AC_MSG_CHECKING(whether we can compile SSE code) @@ -325,6 +333,24 @@ if test "x$enable_mmx" = xyes; then AC_MSG_WARN([The assembler does not support the SSE command set.]) ) + if test "x$enable_sse2" = xyes; then + BABL_DETECT_CFLAGS(sse2_flag, '-msse2') + SSE2_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $sse2_flag" + + AC_MSG_CHECKING(whether we can compile SSE2 code) + + CFLAGS="$CFLAGS $sse2_flag" + + 
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("punpckhwd %xmm0,%xmm1");])], + AC_DEFINE(USE_SSE2, 1, [Define to 1 if SSE2 assembly is available.]) + AC_MSG_RESULT(yes) + , + enable_sse2=no + AC_MSG_RESULT(no) + AC_MSG_WARN([The assembler does not support the SSE2 command set.]) + ) + fi + fi , enable_mmx=no @@ -336,6 +362,7 @@ if test "x$enable_mmx" = xyes; then AC_SUBST(MMX_EXTRA_CFLAGS) AC_SUBST(SSE_EXTRA_CFLAGS) + AC_SUBST(SSE2_EXTRA_CFLAGS) fi diff --git a/extensions/Makefile.am b/extensions/Makefile.am index 2636f17..30ac8c5 100644 --- a/extensions/Makefile.am +++ b/extensions/Makefile.am @@ -21,16 +21,18 @@ ext_LTLIBRARIES = \ gggl-lies.la \ gggl.la \ gimp-8bit.la \ - float.la \ - fast-float.la \ + float.la \ + fast-float.la \ naive-CMYK.la \ - HSV.la \ + HSV.la \ simple.la \ - sse-fixups.la + sse-fixups.la \ + sse2-float.la \ + sse2-int16.la cairo_la_SOURCES = cairo.c CIE_la_SOURCES = CIE.c -expar_la_SOURCES = expar.c +simple_la_SOURCES = simple.c gegl_fixups_la_SOURCES = gegl-fixups.c gggl_lies_la_SOURCES = gggl-lies.c gggl_la_SOURCES = gggl.c @@ -38,9 +40,13 @@ gimp_8bit_la_SOURCES = gimp-8bit.c naive_CMYK_la_SOURCES = naive-CMYK.c HSV_la_SOURCES = HSV.c sse_fixups_la_SOURCES = sse-fixups.c +sse2_float_la_SOURCES = sse2-float.c +sse2_int16_la_SOURCES = sse2-int16.c float_la_SOURCES = float.c fast_float_la_SOURCES = fast-float.c LIBS = $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la $(MATH_LIB) -sse_fixups_la_CFLAGS = $(MMX_EXTRA_CFLAGS) $(SSE_EXTRA_CFLAGS) +sse_fixups_la_CFLAGS = $(SSE_EXTRA_CFLAGS) +sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS) +sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS) diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c new file mode 100644 index 0000000..954e359 --- /dev/null +++ b/extensions/sse2-float.c @@ -0,0 +1,299 @@ +/* babl - dynamically extendable universal pixel conversion library. 
+ * Copyright (C) 2013 Massimo Valentini + * Copyright (C) 2013 Daniel Sabo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General + * Public License along with this library; if not, see + * . + */ + +#include "config.h" + +#if defined(USE_SSE2) + +/* SSE 2 */ +#include + +#include +#include + +#include "babl.h" +#include "babl-cpuaccel.h" +#include "base/util.h" +#include "extensions/util.h" + +#define Q(a) { a, a, a, a } + +static const float BABL_ALPHA_THRESHOLD_FLOAT = (float)BABL_ALPHA_THRESHOLD; + +static long +conv_rgbaF_linear_rgbAF_linear (const float *src, float *dst, long samples) +{ + long i = 0; + long remainder; + + if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0) + { + const long n = (samples / 2) * 2; + const __v4sf *s = (const __v4sf*) src; + __v4sf *d = (__v4sf*)dst; + + for ( ; i < n; i += 2) + { + __v4sf rbaa0, rbaa1; + + __v4sf rgba0 = *s++; + __v4sf rgba1 = *s++; + + /* Expand alpha */ + __v4sf aaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(3, 3, 3, 3)); + __v4sf aaaa1 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba1, _MM_SHUFFLE(3, 3, 3, 3)); + + /* Premultiply */ + rgba0 = rgba0 * aaaa0; + rgba1 = rgba1 * aaaa1; + + /* Shuffle the original alpha value back in */ + rbaa0 = _mm_shuffle_ps(rgba0, aaaa0, _MM_SHUFFLE(0, 0, 2, 0)); + rbaa1 = _mm_shuffle_ps(rgba1, aaaa1, _MM_SHUFFLE(0, 0, 2, 0)); + + rgba0 = _mm_shuffle_ps(rgba0, rbaa0, _MM_SHUFFLE(2, 1, 1, 0)); + rgba1 = _mm_shuffle_ps(rgba1, rbaa1, 
_MM_SHUFFLE(2, 1, 1, 0)); + + *d++ = rgba0; + *d++ = rgba1; + } + _mm_empty (); + } + + dst += i * 4; + src += i * 4; + remainder = samples - i; + while (remainder--) + { + const float a = src[3]; + dst[0] = src[0] * a; + dst[1] = src[1] * a; + dst[2] = src[2] * a; + dst[3] = a; + + src += 4; + dst += 4; + } + + return samples; +} + +static long +conv_rgbAF_linear_rgbaF_linear_shuffle (const float *src, float *dst, long samples) +{ + long i = 0; + long remainder; + + if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0) + { + const long n = samples; + const __v4sf *s = (const __v4sf*) src; + __v4sf *d = (__v4sf*)dst; + + for ( ; i < n; i += 1) + { + __v4sf pre_rgba0, rgba0, rbaa0, raaaa0; + + float alpha0 = ((float *)s)[3]; + pre_rgba0 = *s; + + if (alpha0 <= 0.0f) + { + /* Zero RGB */ + rgba0 = _mm_setzero_ps(); + } + else + { + float recip0 = 1.0f/alpha0; + + /* Expand reciprocal */ + raaaa0 = _mm_load1_ps(&recip0); + + /* Un-Premultiply */ + rgba0 = pre_rgba0 * raaaa0; + } + + /* Shuffle the original alpha value back in */ + rbaa0 = _mm_shuffle_ps(rgba0, pre_rgba0, _MM_SHUFFLE(3, 3, 2, 0)); + rgba0 = _mm_shuffle_ps(rgba0, rbaa0, _MM_SHUFFLE(2, 1, 1, 0)); + + s++; + *d++ = rgba0; + } + _mm_empty (); + } + + dst += i * 4; + src += i * 4; + remainder = samples - i; + while (remainder--) + { + float alpha = src[3]; + float recip; + if (alpha <= 0.0f) + recip = 0.0f; + else + recip = 1.0f/alpha; + dst[0] = src[0] * recip; + dst[1] = src[1] * recip; + dst[2] = src[2] * recip; + dst[3] = alpha; + + src += 4; + dst += 4; + } + + return samples; +} + +static long +conv_rgbAF_linear_rgbaF_linear_spin (const float *src, float *dst, long samples) +{ + long i = 0; + long remainder; + + if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0) + { + const long n = samples; + const __v4sf *s = (const __v4sf*) src; + __v4sf *d = (__v4sf*)dst; + const __v4sf zero = _mm_setzero_ps(); + const __v4sf one = _mm_set_ss(1.0f); + + for ( ; i < n; i += 1) + { + __v4sf pre_abgr0, abgr0, 
rgba0, raaaa0; + + + rgba0 = *s; + /* Rotate to ABGR */ + pre_abgr0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(0, 1, 2, 3)); + + if (_mm_ucomile_ss(pre_abgr0, zero)) + { + /* Zero RGB */ + abgr0 = zero; + } + else + { + /* Un-Premultiply */ + raaaa0 = _mm_div_ss(one, pre_abgr0); + + /* Expand reciprocal */ + raaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)raaaa0, _MM_SHUFFLE(0, 0, 0, 0)); + + /* Un-Premultiply */ + abgr0 = pre_abgr0 * raaaa0; + } + + /* Move the original alpha value back in */ + abgr0 = _mm_move_ss(abgr0, pre_abgr0); + + /* Rotate back to RGBA */ + rgba0 = (__v4sf)_mm_shuffle_epi32((__m128i)abgr0, _MM_SHUFFLE(0, 1, 2, 3)); + + *d++ = rgba0; + s++; + } + _mm_empty (); + } + + dst += i * 4; + src += i * 4; + remainder = samples - i; + while (remainder--) + { + float alpha = src[3]; + float recip; + if (alpha <= 0.0f) + recip = 0.0f; + else + recip = 1.0f/alpha; + dst[0] = src[0] * recip; + dst[1] = src[1] * recip; + dst[2] = src[2] * recip; + dst[3] = alpha; + + src += 4; + dst += 4; + } + + return samples; +} + +#endif /* defined(USE_SSE2) */ + +#define o(src, dst) \ + babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL) + +int init (void); + +int +init (void) +{ +#if defined(USE_SSE2) + + const Babl *rgbaF_linear = babl_format_new ( + babl_model ("RGBA"), + babl_type ("float"), + babl_component ("R"), + babl_component ("G"), + babl_component ("B"), + babl_component ("A"), + NULL); + const Babl *rgbAF_linear = babl_format_new ( + babl_model ("RaGaBaA"), + babl_type ("float"), + babl_component ("Ra"), + babl_component ("Ga"), + babl_component ("Ba"), + babl_component ("A"), + NULL); + + if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) && + (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2)) + + { + babl_conversion_new(rgbaF_linear, + rgbAF_linear, + "linear", + conv_rgbaF_linear_rgbAF_linear, + NULL); + + /* Which of these is faster varies by CPU, and the difference + * is big enough that it's worthwhile to 
include both and + * let them fight it out in the babl benchmarks. + */ + babl_conversion_new(rgbAF_linear, + rgbaF_linear, + "linear", + conv_rgbAF_linear_rgbaF_linear_shuffle, + NULL); + babl_conversion_new(rgbAF_linear, + rgbaF_linear, + "linear", + conv_rgbAF_linear_rgbaF_linear_spin, + NULL); + } + +#endif /* defined(USE_SSE2) */ + + return 0; +} + diff --git a/extensions/sse2-int16.c b/extensions/sse2-int16.c new file mode 100644 index 0000000..252d1a7 --- /dev/null +++ b/extensions/sse2-int16.c @@ -0,0 +1,186 @@ +/* babl - dynamically extendable universal pixel conversion library. + * Copyright (C) 2013 Massimo Valentini + * Copyright (C) 2013 Daniel Sabo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General + * Public License along with this library; if not, see + * . 
+ */ + +#include "config.h" + +#if defined(USE_SSE2) + +/* SSE 2 */ +#include + +#include +#include + +#include "babl.h" +#include "babl-cpuaccel.h" +#include "extensions/util.h" + +#define Q(a) { a, a, a, a } +static const __v4sf u16_float = Q (1.f / 65535); + +static long +conv_rgba16_linear_rgbaF_linear (const uint16_t *src, float *dst, long samples) +{ + long i = 0; + + if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0) + { + long n = (samples / 2) * 2; + const __m128i *s = (const __m128i*) src; + __v4sf *d = (__v4sf*) dst; + + for (; i < n / 2; i++) + { + /* Expand shorts to ints by loading zero in the high bits */ + const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps()); + const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps()); + + /* Convert to float */ + const __m128 u0 = _mm_cvtepi32_ps (t0); + const __m128 u1 = _mm_cvtepi32_ps (t1); + + const __v4sf rgba0 = u0 * u16_float; + const __v4sf rgba1 = u1 * u16_float; + + d[2 * i + 0] = rgba0; + d[2 * i + 1] = rgba1; + } + _mm_empty(); + } + + for (i *= 2 * 4; i != 4 * samples; i++) + dst[i] = src[i] * (1.f / 65535); + + return samples; +} + +static long +conv_rgba16_linear_rgbAF_linear (const uint16_t *src, float *dst, long samples) +{ + long i = 0; + long remainder; + + if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0) + { + long n = (samples / 2) * 2; + const __m128i *s = (const __m128i*) src; + __v4sf *d = (__v4sf*) dst; + + const __v4sf max_mask = { 0.0f, 0.0f, 0.0f, 1.0f }; + + for (; i < n / 2; i++) + { + /* Expand shorts to ints by loading zero in the high bits */ + const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps()); + const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps()); + + /* Convert to float */ + const __m128 u0 = _mm_cvtepi32_ps (t0); + const __m128 u1 = _mm_cvtepi32_ps (t1); + + /* Multiply by 1 / 65535 */ + __v4sf rgba0 = u0 * u16_float; + __v4sf rgba1 = u1 * u16_float; + + /* Expand alpha */ + 
__v4sf aaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(3, 3, 3, 3)); + __v4sf aaaa1 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba1, _MM_SHUFFLE(3, 3, 3, 3)); + + /* Set the value in the alpha slot to 1.0, we know max is sufficient because alpha was a short */ + aaaa0 = _mm_max_ps(aaaa0, max_mask); + aaaa1 = _mm_max_ps(aaaa1, max_mask); + + /* Premultiply */ + rgba0 = rgba0 * aaaa0; + rgba1 = rgba1 * aaaa1; + + d[2 * i + 0] = rgba0; + d[2 * i + 1] = rgba1; + } + _mm_empty(); + } + + dst += i * 2 * 4; + src += i * 2 * 4; + remainder = samples - (i * 2); + while (remainder--) + { + const float a = src[3] / 65535.0f; + const float a_term = a / 65535.0f; + dst[0] = src[0] * a_term; + dst[1] = src[1] * a_term; + dst[2] = src[2] * a_term; + dst[3] = a; + + src += 4; + dst += 4; + } + + return samples; +} + +#endif /* defined(USE_SSE2) */ + +#define o(src, dst) \ + babl_conversion_new (src, dst, "linear", conv_ ## src ## _ ## dst, NULL) + +int init (void); + +int +init (void) +{ +#if defined(USE_SSE2) + + const Babl *rgbaF_linear = babl_format_new ( + babl_model ("RGBA"), + babl_type ("float"), + babl_component ("R"), + babl_component ("G"), + babl_component ("B"), + babl_component ("A"), + NULL); + const Babl *rgbAF_linear = babl_format_new ( + babl_model ("RaGaBaA"), + babl_type ("float"), + babl_component ("Ra"), + babl_component ("Ga"), + babl_component ("Ba"), + babl_component ("A"), + NULL); + const Babl *rgba16_linear = babl_format_new ( + babl_model ("RGBA"), + babl_type ("u16"), + babl_component ("R"), + babl_component ("G"), + babl_component ("B"), + babl_component ("A"), + NULL); + + if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE) && + (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2)) + { + o (rgba16_linear, rgbaF_linear); + o (rgba16_linear, rgbAF_linear); + } + +#endif /* defined(USE_SSE2) */ + + return 0; +} + -- 2.30.2